Load some basic libraries to start with

library(formattable)
library(data.table)
library(stringr)
library(ggmap)
library(tmaptools)
library(dplyr)
library(lubridate)
library(ggplot2)
library(ggrepel)
library(tidyverse)
library(osmdata)
library(sf)
library(rgeos)
library(rgdal)
library(geojsonsf)

Data Cleaning

manhattan <- read.csv("https://raw.githubusercontent.com/lcao21/GIS3_Final_Project/master/Data/manhattan.csv", stringsAsFactors=FALSE)
formattable(head(manhattan))
BOROUGH NEIGHBORHOOD BUILDING.CLASS.CATEGORY TAX.CLASS.AT.PRESENT BLOCK LOT EASE.MENT BUILDING.CLASS.AT.PRESENT ADDRESS APARTMENT.NUMBER ZIP.CODE RESIDENTIAL.UNITS COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE BUILDING.CLASS.AT.TIME.OF.SALE SALE.PRICE SALE.DATE
1 ALPHABET CITY 01 ONE FAMILY DWELLINGS 1 376 43 NA S1 743 EAST 6TH STREET 10009 1 1 2 2,090 3,680 1940 1 S1 3,200,000 7/24/19
1 ALPHABET CITY 01 ONE FAMILY DWELLINGS 1 390 61 NA A4 189 EAST 7TH STREET 10009 1 0 1 987 2,183 1860 1 A4 0 9/25/19
1 ALPHABET CITY 02 TWO FAMILY DWELLINGS 1 404 1 NA B9 166 AVENUE A 10009 2 0 2 1,510 4,520 1900 1 B9 0 7/22/19
1 ALPHABET CITY 03 THREE FAMILY DWELLINGS 1 377 56 NA C0 263 EAST 7TH STREET 10009 3 0 3 2,430 3,600 1899 1 C0 6,300,000 4/30/19
1 ALPHABET CITY 03 THREE FAMILY DWELLINGS 1 393 9 NA C0 604 EAST 11TH STREET 10009 3 0 3 2,375 5,110 1939 1 C0 0 10/24/19
1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS 2 372 23 NA C1 300 EAST 3RD STREET 10009 12 0 12 2,393 7,989 2001 2 C1 1,950,000 8/8/19

Remove unimportant columns

drops <- c("BOROUGH","EASE.MENT", "TAX.CLASS.AT.PRESENT", "BUILDING.CLASS.AT.PRESENT", "APARTMENT.NUMBER")
manhattan <- manhattan[ , !(names(manhattan) %in% drops)]

Keep only rows for family dwellings

manhattan <- manhattan[manhattan$BUILDING.CLASS.CATEGORY %like% "FAMILY", ]

Keep only rows that have non-zero values for sale price

manhattan <-manhattan[!(manhattan$"SALE.PRICE"== 0),]

replace_commas<-function(x){
  x<-as.numeric(gsub("\\,", "", x))
}

manhattan$SALE.PRICE <- replace_commas(manhattan$SALE.PRICE)
manhattan$SALE.PRICE <- as.numeric(manhattan$SALE.PRICE)

Geocoding

Clean addresses

# Convert addresses to type character
manhattan$ADDRESS <- as.character(manhattan$ADDRESS)

# Remove anything before a dash, before a slash, and after a comma
manhattan$ADDRESS <- gsub(".*-","", manhattan$ADDRESS)
manhattan$ADDRESS <- gsub(".*/","", manhattan$ADDRESS)
manhattan$ADDRESS <- gsub("(.*),.*", "\\1", manhattan$ADDRESS)

# Remove rows with address lengths less than 3 
manhattan$address_length <- c(str_count(manhattan$ADDRESS))
manhattan <- manhattan[!(manhattan$address_length < 3),]

# Add ", Manhattan" to each address
manhattan$ADDRESS <- paste(manhattan$ADDRESS, ", Manhattan", sep="")
geocoded_addresses <- geocode_OSM(manhattan$ADDRESS, as.data.frame = TRUE)
geocoded_addresses <- geocoded_addresses[c("query","lat", "lon")]
head(geocoded_addresses)
write.csv(geocoded_addresses,"geocoded_addresses.csv", row.names = FALSE)

Since running the above chunk took a very long time, I exported the dataframe output as a csv called “geocoded_addresses.csv”, so that I can use the csv whenever I need to re-run the markdown file without having to wait >= 5 minutes.

geocoded_addresses <- read.csv("https://raw.githubusercontent.com/lcao21/GIS3_Final_Project/master/Data/geocoded_addresses.csv")
formattable(head(geocoded_addresses))
query lat lon
743 EAST 6TH STREET, Manhattan 40.72290 -73.97740
189 EAST 7TH STREET, Manhattan 40.72493 -73.98051
166 AVENUE A, Manhattan 40.72811 -73.98174
263 EAST 7TH STREET, Manhattan 40.72363 -73.97741
604 EAST 11TH STREET, Manhattan 40.72712 -73.97916
483 WEST 22ND STREET, Manhattan 40.74700 -74.00411

The geocode_OSM() function couldn’t find a few addresses that aren’t huge losses so I’ll ignore those lost rows and merge the geocoded_addresses df with my manhattan df. In the case, since manhattan has more rows than geocoded_addresses, I conduct an “inner join” with merge():

# Inner join 
manhattan <- merge(manhattan, geocoded_addresses, by.x = "ADDRESS", by.y = "query")
# Drop address_length column
manhattan <- manhattan[ , !(names(manhattan) %in% c("address_length"))]
formattable(head(manhattan))
ADDRESS NEIGHBORHOOD BUILDING.CLASS.CATEGORY BLOCK LOT ZIP.CODE RESIDENTIAL.UNITS COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE BUILDING.CLASS.AT.TIME.OF.SALE SALE.PRICE SALE.DATE lat lon
1 SUTTON PLACE, Manhattan MIDTOWN EAST 01 ONE FAMILY DWELLINGS 1372 26 10022 1 0 1 1,434 5,339 1920 1 A7 13000000 10/8/19 40.75738 -73.96031
106 EAST 78TH STREET, Manhattan UPPER EAST SIDE (59-79) 01 ONE FAMILY DWELLINGS 1412 68 10075 1 1 2 1,839 4,344 1899 1 S1 7200000 6/10/19 40.77457 -73.96056
109 WEST 11TH STREET, Manhattan GREENWICH VILLAGE-WEST 01 ONE FAMILY DWELLINGS 607 50 10011 1 0 1 1,665 4,060 1899 1 A4 11200000 5/22/19 40.73577 -73.99867
11 EAST 129 STREET, Manhattan HARLEM-CENTRAL 01 ONE FAMILY DWELLINGS 1754 8 10035 1 0 1 1,081 2,800 1905 1 A4 1550000 8/22/19 40.75523 -73.96185
11 WEST 12TH STREET, Manhattan GREENWICH VILLAGE-CENTRAL 01 ONE FAMILY DWELLINGS 576 48 10011 1 0 1 2,581 8,125 1910 1 A4 19900000 6/14/19 40.73509 -73.99506
110 WASHINGTON PLACE, Manhattan GREENWICH VILLAGE-WEST 03 THREE FAMILY DWELLINGS 592 14 10014 3 0 3 1,575 2,646 1899 1 C0 6800000 4/15/19 14.27621 120.96607

An entry has really suspicious lat and lon columns, as they’re both wildly different than the others:

##            manhattan[6, ]$ADDRESS manhattan[6, ]$lat manhattan[6, ]$lon
## 1 110 WASHINGTON PLACE, Manhattan           14.27621           120.9661

Using https://www.latlong.net/convert-address-to-lat-long.html, I found the right lat and lon and manually replaced the column values.

manhattan[6,]$lat <- 40.732422
manhattan[6,]$lon <- -74.001152

# Check the min and max of lat and lon that will also be used for mapping
height <- max(manhattan$lat) - min(manhattan$lat)
width <- max(manhattan$lon) - min(manhattan$lon)
borders <- c(bottom  = min(manhattan$lat)  - 0.2 * height,
             top     = max(manhattan$lat)  + 0.2 * height,
             left    = min(manhattan$lon) - 0.2 * width,
             right   = max(manhattan$lon) + 0.2 * width)
borders
##    bottom       top      left     right 
##  40.67401  40.91134 -74.03133 -73.88918

Mapping Manhattan homes

map <- get_stamenmap(borders, zoom = 11, maptype = "toner-lite")
ggmap(map) +
  geom_point(data = manhattan, mapping = aes(x = lon, y = lat,
  col = SALE.PRICE)) +
  scale_color_gradient(low = "yellow", high = "red")

I’ll be doing more data exploration once I merge the current manhattan df with the 6 other datasets. I just wanted to output one simple map before doing so. It seems that the most expensive homes are located around Central Park (Upper East Side, Upper West Side), Greenwich Village, and SoHo.

Merging 6 variable datasets

Schools

schools <- read.csv("https://raw.githubusercontent.com/lcao21/GIS3_Final_Project/master/Data/schools.csv", stringsAsFactors = FALSE)
schools$LATITUDE <- as.numeric(schools$LATITUDE)
## Warning: NAs introduced by coercion
schools$LONGITUDE <- as.numeric(schools$LONGITUDE)
## Warning: NAs introduced by coercion

Remove rows with zeros for longitude and latitude

schools <-schools[!(schools$"LONGITUDE"== 0),]
schools <- schools[with(schools, order(LONGITUDE)),]

Filter for Manhattan schools

m_districts <- c(1,2,3,4,5,6,7,8,9,10)
schools <- schools[schools$Council.district %in% m_districts, ]

I found that filtering for council districts was the most accurate way of finding schools located in Manhattan. I initally tried using community district but the community district scale here is much different than the ones I found on the NYC.gov website. According to https://council.nyc.gov/districts/, Manhattan council districts are from 1 to 10 inclusive, though district 8 represents parts of Manhattan and the Bronx.

Next, I want to determine how far each school is from a home property, but for each property, I only care about schools within a certain proximity. Furthermore, since high schools aren’t subject to school zoning laws, distance betwen home and school is weighed more than for middle and elementary schools, which are assigned based on where students’ live. Although school zoning rules probably affect proprty sales (i.e. there’s a premium on property sales near the best public high school in Manhattan), I assume that more weight is put on attending good high schools rather than middle or elementary schools, since the former imapcts college acceptances and thus its quality is more important to parents and students.

Filter out schools that don’t serve grades 9-12

hs_grades <- c("09,10,11,12")
schools <- schools[schools$Grades_text %like% hs_grades, ]

Quick map of final schools

ggmap(map) +
  geom_point(data = schools, mapping = aes(x = LONGITUDE, y = LATITUDE))

Now that I have the high schools, I want to now know how many of these schools fall within a buffer of a Manhattan home. So, I have to first create buffers around each home, and I’ve decided to create 1609.34 meter (1 mile) buffers. To do so, I had to convert the manhattan df to an sf object with st_as_sf and also transform the original EPSG:4326 CRS to the 2263 projected coordinate system (NAD83/New York Long Island). I also created a column called home_ID that gives each row (home) a unique number. Ultimately, I get an sf object called manhattan_sf_buff that has two fields: home_ID and geometry; geometry is the buffer polygon.

Create 1609.34 meter buffers around homes

manhattan_copy <- copy(manhattan)
coordinates(manhattan_copy) <- c("lon", "lat")
proj4string(manhattan_copy) <- CRS("+init=epsg:4326")
manhattan_sf <- st_as_sf(manhattan_copy) %>% st_transform(2263) # nad83 
manhattan_sf_buff <- st_buffer(manhattan_sf, 1609.34)
manhattan_sf_buff$home_ID <- seq.int(nrow(manhattan_sf_buff))

manhattan_sf_buff <- subset(manhattan_sf_buff, select=c("home_ID", "geometry"))

formattable(head(as.data.frame(manhattan_sf_buff)))
home_ID geometry
1 POLYGON ((996855.6 215213.3…
2 POLYGON ((996782.7 221474.6…
3 POLYGON ((986228.4 207337.2…
4 POLYGON ((996430 214430.1, …
5 POLYGON ((987229.7 207089.4…
6 POLYGON ((985540.1 206117.7…

I go through a similar process of transforming the schools dataframe of coverting it into an sf object, re-projecting it to the EPSG:2263 projected coordinate system, and creating a column of unique, integer IDs. Ultimately, I get an sf object called schools_sf that has two fields: school_ID and geometry; geometry is of point geometry and represents each school.

schools_copy <- copy(schools)
coordinates(schools_copy) <- c("LONGITUDE", "LATITUDE")
proj4string(schools_copy) <- CRS("+init=epsg:4326")
schools_sf <- st_as_sf(schools_copy) %>% st_transform(2263)
schools_sf$school_ID <- seq.int(nrow(schools_sf))
schools_sf <- subset(schools_sf, select=c("school_ID", "geometry"))

formattable(head(as.data.frame(schools_sf)))
school_ID geometry
1079 1 POINT (978997.2 190477.9)
1080 2 POINT (978997.2 190477.9)
757 3 POINT (979410.2 196631.7)
1113 4 POINT (979859.1 196347.8)
1029 5 POINT (980391 200872.7)
911 6 POINT (980561.6 196377.9)

Plot of all homes and schools

plot(manhattan_sf_buff$geometry, col="Blue")
plot(st_geometry(schools_sf), pch = 3, col = "Red", add= TRUE)

To calculate the intersections between the homes and high schools, I use the st_intersection() function from the sf library.

Calculate intersection

intersection <- st_intersection(x = manhattan_sf_buff, y = schools_sf)
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries

By plotting the homes and the intersections, we can see a clear decrease in the number of high schools, telling us that only certain schools are within 1609.34 meter buffers of a home.

Plot of all homes and only schools that intersect

plot(manhattan_sf_buff$geometry, col="Blue")
plot(intersection, col = "Red", pch = 3, add = TRUE)
## Warning in plot.sf(intersection, col = "Red", pch = 3, add = TRUE): ignoring all
## but the first attribute

Calculate the number of schools within the buffer of each home

int_result <- intersection %>% 
  group_by(home_ID) %>% 
  count()

formattable(head(int_result))
home_ID n geometry
3 2 POINT (985433.1 207572.8)
5 3 MULTIPOINT ((985433.1 20757…
8 1 POINT (993448.1 229696.9)
10 5 MULTIPOINT ((982544.1 20488…
11 4 MULTIPOINT ((998395 234567….
12 4 POINT (991316.2 225687.7)

Now we have to left join the manhattan dataframe with int_result. First, I created the same column of unique, integer ID for manhattan so that I can join the two dataframes with the shared home_ID column. After the merge, homes that don’t have schools within its buffer have NA values in the “n” column so I replace those with zeros and rename the column num_schools. I call this final dataframe manhattan_int_schools.

Left join manhattan with int_result

manhattan$home_ID <- seq.int(nrow(manhattan))
manhattan_int_schools <- merge(x = manhattan, y = int_result, by = "home_ID", all.x = TRUE)
manhattan_int_schools$n[is.na(manhattan_int_schools$n)] <- 0
colnames(manhattan_int_schools)[20] <- "num_schools"

formattable(head(manhattan_int_schools))
home_ID ADDRESS NEIGHBORHOOD BUILDING.CLASS.CATEGORY BLOCK LOT ZIP.CODE RESIDENTIAL.UNITS COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE BUILDING.CLASS.AT.TIME.OF.SALE SALE.PRICE SALE.DATE lat lon num_schools geometry
1 1 SUTTON PLACE, Manhattan MIDTOWN EAST 01 ONE FAMILY DWELLINGS 1372 26 10022 1 0 1 1,434 5,339 1920 1 A7 13000000 10/8/19 40.75738 -73.96031 0 GEOMETRYCOLLECTION EMPTY
2 106 EAST 78TH STREET, Manhattan UPPER EAST SIDE (59-79) 01 ONE FAMILY DWELLINGS 1412 68 10075 1 1 2 1,839 4,344 1899 1 S1 7200000 6/10/19 40.77457 -73.96056 0 GEOMETRYCOLLECTION EMPTY
3 109 WEST 11TH STREET, Manhattan GREENWICH VILLAGE-WEST 01 ONE FAMILY DWELLINGS 607 50 10011 1 0 1 1,665 4,060 1899 1 A4 11200000 5/22/19 40.73577 -73.99867 2 POINT (985433.1 207572.8)
4 11 EAST 129 STREET, Manhattan HARLEM-CENTRAL 01 ONE FAMILY DWELLINGS 1754 8 10035 1 0 1 1,081 2,800 1905 1 A4 1550000 8/22/19 40.75523 -73.96185 0 GEOMETRYCOLLECTION EMPTY
5 11 WEST 12TH STREET, Manhattan GREENWICH VILLAGE-CENTRAL 01 ONE FAMILY DWELLINGS 576 48 10011 1 0 1 2,581 8,125 1910 1 A4 19900000 6/14/19 40.73509 -73.99506 3 MULTIPOINT ((985433.1 20757…
6 110 WASHINGTON PLACE, Manhattan GREENWICH VILLAGE-WEST 03 THREE FAMILY DWELLINGS 592 14 10014 3 0 3 1,575 2,646 1899 1 C0 6800000 4/15/19 40.73242 -74.00115 0 GEOMETRYCOLLECTION EMPTY

Map of only homes and schools that intersect

# Schools that intersect with homes 
schools_that_int <- as.data.frame(st_transform(intersection, crs = 4326))
schools_that_int <- schools_that_int[!duplicated(schools_that_int[,c("school_ID")]),]

schools$school_ID <- seq.int(nrow(schools))

test2 <- merge(schools, schools_that_int)
formattable(head(test2)) 
school_ID fiscal_year system_code location_code location_name BEDS Managed_by_name location_type_description Location_Category_Description Grades_text Grades_final_text open_date Status_descriptions Primary_building_code primary_address_line_1 State_code X_COORDINATE Y_COORDINATE LONGITUDE LATITUDE Community_district Council.district Census_tract Borough_block_lot NTA NTA_Name Principal_Name Principal_title Principal_phone_number fax_number Geographical_District_code Administrative_District_Code Administrative_District_Name community_school_sup_name Tier_3_Support_Location_Name Tier_3_Support_Leader_Name Tier_2_Support_Location_Name HighSchool_Network_Location_Code HighSchool_Network_Name HighSchool_Network_Superintendent Community_district.1 Police_precinct home_ID geometry
5 2020 02M475 M475 Stuyvesant High School 310200011475 DOE General Academic High school 09,10,11,12 09,10,11,12 1996-06-05T00:00:00.000 Open M477 345 CHAMBERS STREET NY 980391 200873 -74.01392 40.71802 101 1 31703 1000160215 MN25 Battery Park City-Lower Manhattan Eric Contreras Principal 212-312-4800 212-587-3874 2 2 NULL NULL NYCDOE Borough Office - Manhattan CHU, YUET School Support Team 1- Manhattan HS04 HS Network 04 Superintendent Office ORLEN, VIVIAN 101 1 130 POINT (-74.01392 40.71802)
13 2020 75M721 M721 P.S. M721 - Manhattan Occupational Training Center 307500011721 DOE Special Education Secondary School 06,07,08,09,10,11,12,SE 09,10,11,12,SE 1990-12-04T00:00:00.000 Open M641 250 WEST HOUSTON STREET NY 982544 204890 -74.00616 40.72905 102 3 6700 1005810054 MN23 West Village SHOLOM FRIED Principal 212-675-7926 212-255-3227 2 75 CITYWIDE SPECIAL EDUCATION LOUISSAINT, KETLER D75 CITYWIDE BCO Tillman Roberto Children First Network 755 NULL NULL NULL 102 6 10 POINT (-74.00616 40.72905)
14 2020 75M721 M721 P.S. M721 - Manhattan Occupational Training Center 307500011721 DOE Special Education Secondary School 06,07,08,09,10,11,12,SE 09,10,11,12,SE 1990-12-04T00:00:00.000 Open M641 250 WEST HOUSTON STREET NY 982544 204890 -74.00616 40.72905 102 3 6700 1005810054 MN23 West Village SHOLOM FRIED Principal 212-675-7926 212-255-3227 2 75 CITYWIDE SPECIAL EDUCATION LOUISSAINT, KETLER D75 CITYWIDE BCO Tillman Roberto Children First Network 755 NULL NULL NULL 102 6 10 POINT (-74.00616 40.72905)
15 2020 02M376 M376 NYC iSchool 310200011376 DOE General Academic High school 09,10,11,12,SE 09,10,11,12 2008-07-01T00:00:00.000 Open M615 131 AVENUE OF THE AMERICAS NY 982848 203279 -74.00506 40.72463 102 3 3700 1004910016 MN24 SoHo-TriBeCa-Civic Center-Little Italy Isora Bailey Principal 917-237-7300 212-219-0743 2 2 NULL NULL NYCDOE Borough Office - Manhattan CHU, YUET School Support Team 1- Manhattan HS04 HS Network 04 Superintendent Office ORLEN, VIVIAN 102 1 10 POINT (-74.00506 40.72463)
16 2020 02M615 M615 Chelsea Career and Technical Education High School 310200011615 DOE Career Technical High school 09,10,11,12,SE 09,10,11,12 1905-07-01T00:00:00.000 Open M615 131 AVENUE OF THE AMERICAS NY 982848 203279 -74.00506 40.72463 102 3 3700 1004910016 MN24 SoHo-TriBeCa-Civic Center-Little Italy Jaivelle Reed Principal 212-925-1080 212-941-7934 2 2 NULL NULL NYCDOE Borough Office - Manhattan CHU, YUET School Support Team 1- Manhattan HS04 HS Network 04 Superintendent Office ORLEN, VIVIAN 102 1 10 POINT (-74.00506 40.72463)
17 2020 84M522 M522 Broome Street Academy Charter School 310200860992 Charter General Academic High school 09,10,11,12,SE 09,10,11,12 2011-07-01T00:00:00.000 Open MBDF 121 AVENUE OF THE AMERICAS NY 982950 203014 -74.00469 40.72391 102 3 3700 0 MN24 SoHo-TriBeCa-Civic Center-Little Italy Melissa Silberman Principal 212-453-0295 212-966-7253 2 84 OFFICE OF CHARTER SCHOOLS NULL NULL NULL NULL NULL NULL NULL 102 1 10 POINT (-74.00469 40.72391)
# Homes with at least 1 school in its buffer
test3 <- manhattan_int_schools[manhattan_int_schools$num_schools > 0,]
formattable(head(test3))
home_ID ADDRESS NEIGHBORHOOD BUILDING.CLASS.CATEGORY BLOCK LOT ZIP.CODE RESIDENTIAL.UNITS COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE BUILDING.CLASS.AT.TIME.OF.SALE SALE.PRICE SALE.DATE lat lon num_schools geometry
3 3 109 WEST 11TH STREET, Manhattan GREENWICH VILLAGE-WEST 01 ONE FAMILY DWELLINGS 607 50 10011 1 0 1 1,665 4,060 1899 1 A4 11200000 5/22/19 40.73577 -73.99867 2 POINT (985433.1 207572.8)
5 5 11 WEST 12TH STREET, Manhattan GREENWICH VILLAGE-CENTRAL 01 ONE FAMILY DWELLINGS 576 48 10011 1 0 1 2,581 8,125 1910 1 A4 19900000 6/14/19 40.73509 -73.99506 3 MULTIPOINT ((985433.1 20757…
8 8 115 MANHATTAN AVENUE, Manhattan MANHATTAN VALLEY 02 TWO FAMILY DWELLINGS 1840 152 10025 2 0 2 900 3,600 1899 1 B1 10 2/11/20 40.79783 -73.96233 1 POINT (993448.1 229696.9)
10 10 116 SULLIVAN STREET, Manhattan SOHO 02 TWO FAMILY DWELLINGS 504 29 10012 2 0 2 2,350 5,446 1900 1 B1 13450000 3/2/20 40.72613 -74.00286 5 MULTIPOINT ((982544.1 20488…
11 11 116 WEST 130TH STREET, Manhattan HARLEM-CENTRAL 03 THREE FAMILY DWELLINGS 1914 41 10027 3 0 3 1,665 3,476 1910 1 C0 2300000 4/19/19 40.81115 -73.94412 4 MULTIPOINT ((998395 234567….
12 12 116 WEST 80TH STREET, Manhattan UPPER WEST SIDE (79-96) 01 ONE FAMILY DWELLINGS 1210 142 10024 1 0 1 1,839 5,520 1910 1 A4 4675000 8/2/19 40.78288 -73.97604 4 POINT (991316.2 225687.7)
map2 <- get_stamenmap(borders, zoom = 11, maptype = "terrain")

ggmap(map2) +
  geom_point(data = test2, mapping = aes(x = LONGITUDE, y = LATITUDE), col="Red", shape=3) + # Schools
  geom_point(data = test3, mapping = aes(x = lon, y = lat), col="Blue") # Homes

Healthy Stores

stores <- read.csv("https://raw.githubusercontent.com/lcao21/GIS3_Final_Project/master/Data/healthystores.csv", stringsAsFactors = FALSE)

Remove rows with zeros for longitude and latitude

stores <-stores[!is.na(stores$Longitude), ]

Filter for New York stores

stores <- stores[stores$Borough %like% "New York", ]

After removing stores without latitude and longitude values and aren’t located in the New York borough, I’m left with 68 stores.

Quick map of final stores

ggmap(map) +
  geom_point(data = stores, mapping = aes(x = Longitude, y = Latitude))

It seems that the dataset for grocery stores in New York are all clustered in the Upper East Side. My dataset for Manhattan homes, however, is spread throughout Manhattan so I expect that a lot of homes will have zero stores in their buffers. So, I decided to combine stores with data on farmers markets.

The farmers market data, as shown below, is spread out more evently than stores.

Clean farmers markets data and map it

farmers <- read.csv("https://raw.githubusercontent.com/lcao21/GIS3_Final_Project/master/Data/farmersmarkets.csv", stringsAsFactors = FALSE)
farmers <-farmers[!is.na(farmers$Longitude), ]
farmers <- farmers[farmers$Borough %like% "Manhattan", ]

ggmap(map) +
  geom_point(data = farmers, mapping = aes(x = Longitude, y = Latitude))

The final merged dataframe is called food and has four columns: name, street_address, latitude, and longitude.

Combine stores and farmers dataframes

food_names <- c(stores$Store.Name, farmers$Market.Name)

stores$street_address <- paste(stores$Street.Address, stores$Address, sep=" ")
food_addresses <- c(stores$street_address, farmers$Street.Address)

food_lat <- c(stores$Latitude, farmers$Latitude)

food_lon <- c(stores$Longitude, farmers$Longitude)

food <- data.frame("name" = food_names, "street_address" = food_addresses, "latitude" = food_lat, "longitude" = food_lon) 

Quick map of healthy grocery vendors

ggmap(map) +
  geom_point(data = food, mapping = aes(x = longitude, y = latitude))

Since I already created an sf object with 1609.34 meter buffers for each home–manhattan_sf_buff, I can use it again here. Now, I have to transform the food dataframe into an sf object so that I can use st_intersection() between the two.

foods_copy <- copy(food)
coordinates(foods_copy) <- c("longitude", "latitude")
proj4string(foods_copy) <- CRS("+init=epsg:4326")
food_sf <- st_as_sf(foods_copy) %>% st_transform(2263)
food_sf$food_ID <- seq.int(nrow(food_sf))
food_sf <- subset(food_sf, select=c("food_ID", "geometry"))

formattable(head(as.data.frame(food_sf)))
food_ID geometry
1 POINT (999869.1 230322.1)
2 POINT (1000307 230058.2)
3 POINT (1000599 228063.3)
4 POINT (1000611 228095)
5 POINT (1001628 229567.3)
6 POINT (1000484 227856.3)

Plot of all homes and food vendors

plot(manhattan_sf_buff$geometry, col="Blue")
plot(st_geometry(food_sf), pch = 3, col = "Green", add= TRUE)

Calculate intersection

food_intersection <- st_intersection(x = manhattan_sf_buff, y = food_sf)
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries

Plot of all homes and only food vendors that intersect

plot(manhattan_sf_buff$geometry, col="Blue")
plot(food_intersection, col = "Green", pch = 3, add = TRUE)
## Warning in plot.sf(food_intersection, col = "Green", pch = 3, add = TRUE):
## ignoring all but the first attribute

Calculate the number of food vendors within the buffer of each home

food_int_result <- food_intersection %>% 
  group_by(home_ID) %>% 
  count()

formattable(head(food_int_result))
home_ID n geometry
5 1 POINT (986930.1 207834.2)
8 1 POINT (995422.1 231161.9)
9 6 MULTIPOINT ((997335.9 22679…
11 2 MULTIPOINT ((998555.1 23401…
12 1 POINT (990898 223916.9)
19 10 MULTIPOINT ((997335.9 22679…

Now I left joined manhattan_int_schools with food_int_result, using home_ID as the mutual column, and renamed the dataframe manhattan_ints because all the dependent variable columns will be in here or eventually merged into it. For rows that have NA values for the column of food vendor counts, I replaced them with zeros.

manhattan_ints <- merge(x = manhattan_int_schools, y = food_int_result, by = "home_ID", all.x = TRUE)
manhattan_ints$n[is.na(manhattan_ints$n)] <- 0
colnames(manhattan_ints)[22] <- "num_food"

formattable(head(manhattan_ints))
home_ID ADDRESS NEIGHBORHOOD BUILDING.CLASS.CATEGORY BLOCK LOT ZIP.CODE RESIDENTIAL.UNITS COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE BUILDING.CLASS.AT.TIME.OF.SALE SALE.PRICE SALE.DATE lat lon num_schools geometry.x num_food geometry.y
1 1 SUTTON PLACE, Manhattan MIDTOWN EAST 01 ONE FAMILY DWELLINGS 1372 26 10022 1 0 1 1,434 5,339 1920 1 A7 13000000 10/8/19 40.75738 -73.96031 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY
2 106 EAST 78TH STREET, Manhattan UPPER EAST SIDE (59-79) 01 ONE FAMILY DWELLINGS 1412 68 10075 1 1 2 1,839 4,344 1899 1 S1 7200000 6/10/19 40.77457 -73.96056 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY
3 109 WEST 11TH STREET, Manhattan GREENWICH VILLAGE-WEST 01 ONE FAMILY DWELLINGS 607 50 10011 1 0 1 1,665 4,060 1899 1 A4 11200000 5/22/19 40.73577 -73.99867 2 POINT (985433.1 207572.8) 0 GEOMETRYCOLLECTION EMPTY
4 11 EAST 129 STREET, Manhattan HARLEM-CENTRAL 01 ONE FAMILY DWELLINGS 1754 8 10035 1 0 1 1,081 2,800 1905 1 A4 1550000 8/22/19 40.75523 -73.96185 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY
5 11 WEST 12TH STREET, Manhattan GREENWICH VILLAGE-CENTRAL 01 ONE FAMILY DWELLINGS 576 48 10011 1 0 1 2,581 8,125 1910 1 A4 19900000 6/14/19 40.73509 -73.99506 3 MULTIPOINT ((985433.1 20757… 1 POINT (986930.1 207834.2)
6 110 WASHINGTON PLACE, Manhattan GREENWICH VILLAGE-WEST 03 THREE FAMILY DWELLINGS 592 14 10014 3 0 3 1,575 2,646 1899 1 C0 6800000 4/15/19 40.73242 -74.00115 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY

Map of only homes and food vendors that intersect

# Food vendors that intersect with homes 
food_that_int <- as.data.frame(st_transform(food_intersection, crs = 4326))
food_that_int <- food_that_int[!duplicated(food_that_int[,c("food_ID")]),]

food$food_ID <- seq.int(nrow(food))

food_merged <- merge(food, food_that_int)
formattable(head(food_merged)) 
food_ID name street_address latitude longitude home_ID geometry
1 Comidas Dominicanas Capellan Deli 1626 Park Avenue 40.79884 -73.94359 156 POINT (-73.94359 40.79884)
5 Juice Bar Deli 308 East 116th Street 40.79677 -73.93724 93 POINT (-73.93724 40.79677)
9 El Tepeyac Grocery 1621 Lexington Avenue 40.78992 -73.94796 19 POINT (-73.94796 40.78992)
13 JJ Food  Market 153 East 99th Street 40.78769 -73.94923 9 POINT (-73.94923 40.78769)
14 Deli Mini Mart 339 East 115th Street 40.79565 -73.93670 93 POINT (-73.9367 40.79565)
15 Cherry Deli Gourmet 1575 Lexington Avenue 40.78874 -73.94882 9 POINT (-73.94882 40.78874)
# Homes with at least 1 food vendor in its buffer
homes_with_food <- manhattan_ints[manhattan_ints$num_food > 0,]
formattable(head(homes_with_food))
home_ID ADDRESS NEIGHBORHOOD BUILDING.CLASS.CATEGORY BLOCK LOT ZIP.CODE RESIDENTIAL.UNITS COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE BUILDING.CLASS.AT.TIME.OF.SALE SALE.PRICE SALE.DATE lat lon num_schools geometry.x num_food geometry.y
5 5 11 WEST 12TH STREET, Manhattan GREENWICH VILLAGE-CENTRAL 01 ONE FAMILY DWELLINGS 576 48 10011 1 0 1 2,581 8,125 1910 1 A4 19900000 6/14/19 40.73509 -73.99506 3 MULTIPOINT ((985433.1 20757… 1 POINT (986930.1 207834.2)
8 8 115 MANHATTAN AVENUE, Manhattan MANHATTAN VALLEY 02 TWO FAMILY DWELLINGS 1840 152 10025 2 0 2 900 3,600 1899 1 B1 10 2/11/20 40.79783 -73.96233 1 POINT (993448.1 229696.9) 1 POINT (995422.1 231161.9)
9 9 116 EAST 95TH STREET, Manhattan UPPER EAST SIDE (79-96) 01 ONE FAMILY DWELLINGS 1523 68 10128 1 0 1 2,013 4,160 1899 1 A9 8000000 3/20/20 40.78548 -73.95259 0 GEOMETRYCOLLECTION EMPTY 6 MULTIPOINT ((997335.9 22679…
11 11 116 WEST 130TH STREET, Manhattan HARLEM-CENTRAL 03 THREE FAMILY DWELLINGS 1914 41 10027 3 0 3 1,665 3,476 1910 1 C0 2300000 4/19/19 40.81115 -73.94412 4 MULTIPOINT ((998395 234567…. 2 MULTIPOINT ((998555.1 23401…
12 12 116 WEST 80TH STREET, Manhattan UPPER WEST SIDE (79-96) 01 ONE FAMILY DWELLINGS 1210 142 10024 1 0 1 1,839 5,520 1910 1 A4 4675000 8/2/19 40.78288 -73.97604 4 POINT (991316.2 225687.7) 1 POINT (990898 223916.9)
19 19 1255 PARK AVENUE, Manhattan HARLEM-EAST 02 TWO FAMILY DWELLINGS 1625 1 10029 2 0 2 847 1,650 1920 1 B1 906500 2/7/20 40.78720 -73.95172 0 GEOMETRYCOLLECTION EMPTY 10 MULTIPOINT ((997335.9 22679…
ggmap(map2) +
  geom_point(data = food_merged, mapping = aes(x = longitude, y = latitude), col="Green", shape=3) + # Food 
  geom_point(data = homes_with_food, mapping = aes(x = lon, y = lat), col="Blue") # Homes

I repeat the whole process again but for the parks dataset this time. Unlike schools, stores, and farmers’ markets, however, parks aren’t represent as points but rather as polygons, and I loaded them in as geojson file.

Parks

parks <-geojson_sf("parks.geojson")
parks$park_ID <- seq.int(nrow(parks))
formattable(head(as.data.frame(parks)))
system shape_leng shape_area parknum landuse park_name sub_code feat_code status source_id geometry park_ID
NA 829.420106841 41539.8013629 B021 Community Park Commodore Barry Park 491050 4910 Updated 21491000001 MULTIPOLYGON (((-73.97922 4… 1
NA 607.555193338 21960.4949197 B222 Neighborhood Park Pierrepont Playground 498000 4980 Unchanged 21498000002 MULTIPOLYGON (((-73.99732 4… 2
NA 784.42199003 28079.0762647 B326 Neighborhood Park Cobble Hill Park 498000 4980 Unchanged 21498000003 MULTIPOLYGON (((-73.99557 4… 3
NA 2672.71014961 438416.114511 B021 Community Park Commodore Barry Park 498000 4980 Unchanged 21498000004 MULTIPOLYGON (((-73.97747 4… 4
NA 1376.10639843 14465.699799 B223DG Triangle/Plaza Brooklyn Heights Promenade 498000 4980 Unchanged 21498000005 MULTIPOLYGON (((-73.99747 4… 5
NA 1218.55982108 54510.4377633 B223K Parkway Trinity Park 498000 4980 Unchanged 21498000015 MULTIPOLYGON (((-73.98416 4… 6

There isn’t a column for borough, so I can’t filter for only parks in Manhattan. Nevertheless, this shouldn’t be a problem once I calculate the intersections. First, I have to re-project the parks data.

Re-project parks so that it has the same crs (2263) as the homes

parks_copy <- copy(parks)
parks_sf <- st_transform(parks_copy, crs = 2263)

Plot all homes and parks

plot(manhattan_sf_buff$geometry, col="Blue")
plot(st_geometry(parks_sf), col = "darkseagreen1", add= TRUE)

Calculate intersection

parks_intersection <- st_intersection(x = manhattan_sf_buff, y = parks_sf)
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries

Plot of all homes and only parks that intersect

plot(manhattan_sf_buff$geometry, col="Blue")
plot(parks_intersection, col = "darkseagreen1", add = TRUE)
## Warning in plot.sf(parks_intersection, col = "darkseagreen1", add = TRUE):
## ignoring all but the first attribute

Calculate the number of parks within the buffer of each home

parks_int_result <- parks_intersection %>% 
  group_by(home_ID) %>% 
  count()

formattable(head(parks_int_result))
home_ID n geometry
1 18 MULTIPOLYGON (((994393.8 21…
2 14 MULTIPOLYGON (((994251.7 22…
3 20 MULTIPOLYGON (((985002 2057…
4 13 MULTIPOLYGON (((993976 2134…
5 5 MULTIPOLYGON (((985441.6 20…
6 41 MULTIPOLYGON (((983626.9 20…

The counts seem really high but if we look at parks_intersection, we see that some of the parks are being counted multiple times for the same parks. This is because the original dataset assigns different shape_area, system, source_id, parknum, and shape_leng columns to some of the same parks. So even after applying the unique() function, I get the same rows.

nrow(as.data.frame(unique(parks_intersection))) == nrow(as.data.frame(parks_intersection))
## [1] TRUE

I add the counts of parks that intersect with home buffers to the main dataframe: manhattan_ints.

Left join manhattan_ints with parks_int_result

manhattan_ints <- merge(x = manhattan_ints, y = parks_int_result, by = "home_ID", all.x = TRUE)
manhattan_ints$n[is.na(manhattan_ints$n)] <- 0
colnames(manhattan_ints)[24] <- "num_parks"
formattable(head(manhattan_ints))
home_ID ADDRESS NEIGHBORHOOD BUILDING.CLASS.CATEGORY BLOCK LOT ZIP.CODE RESIDENTIAL.UNITS COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE BUILDING.CLASS.AT.TIME.OF.SALE SALE.PRICE SALE.DATE lat lon num_schools geometry.x num_food geometry.y num_parks geometry
1 1 SUTTON PLACE, Manhattan MIDTOWN EAST 01 ONE FAMILY DWELLINGS 1372 26 10022 1 0 1 1,434 5,339 1920 1 A7 13000000 10/8/19 40.75738 -73.96031 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY 18 MULTIPOLYGON (((994393.8 21…
2 106 EAST 78TH STREET, Manhattan UPPER EAST SIDE (59-79) 01 ONE FAMILY DWELLINGS 1412 68 10075 1 1 2 1,839 4,344 1899 1 S1 7200000 6/10/19 40.77457 -73.96056 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY 14 MULTIPOLYGON (((994251.7 22…
3 109 WEST 11TH STREET, Manhattan GREENWICH VILLAGE-WEST 01 ONE FAMILY DWELLINGS 607 50 10011 1 0 1 1,665 4,060 1899 1 A4 11200000 5/22/19 40.73577 -73.99867 2 POINT (985433.1 207572.8) 0 GEOMETRYCOLLECTION EMPTY 20 MULTIPOLYGON (((985002 2057…
4 11 EAST 129 STREET, Manhattan HARLEM-CENTRAL 01 ONE FAMILY DWELLINGS 1754 8 10035 1 0 1 1,081 2,800 1905 1 A4 1550000 8/22/19 40.75523 -73.96185 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY 13 MULTIPOLYGON (((993976 2134…
5 11 WEST 12TH STREET, Manhattan GREENWICH VILLAGE-CENTRAL 01 ONE FAMILY DWELLINGS 576 48 10011 1 0 1 2,581 8,125 1910 1 A4 19900000 6/14/19 40.73509 -73.99506 3 MULTIPOINT ((985433.1 20757… 1 POINT (986930.1 207834.2) 5 MULTIPOLYGON (((985441.6 20…
6 110 WASHINGTON PLACE, Manhattan GREENWICH VILLAGE-WEST 03 THREE FAMILY DWELLINGS 592 14 10014 3 0 3 1,575 2,646 1899 1 C0 6800000 4/15/19 40.73242 -74.00115 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY 41 MULTIPOLYGON (((983626.9 20…

Subway Stations

Like parks, I read in the data for subway stations as a geojson file.

subway <- geojson_sf("subway.geojson")
subway$subway_ID <- seq.int(nrow(subway))
formattable(head(as.data.frame(subway)))
notes url name line objectid geometry subway_ID
4 nights, 6-all times, 6 Express-weekdays AM southbound, PM northbound http://web.mta.info/nyct/service/ Astor Pl 4-6-6 Express 1 POINT (-73.99107 40.73005) 1
4 nights, 6-all times, 6 Express-weekdays AM southbound, PM northbound http://web.mta.info/nyct/service/ Canal St 4-6-6 Express 2 POINT (-74.00019 40.7188) 2
1-all times, 2-nights http://web.mta.info/nyct/service/ 50th St 1-2 3 POINT (-73.98385 40.76173) 3
4-nights, 3-all other times, 2-all times http://web.mta.info/nyct/service/ Bergen St 2-3-4 4 POINT (-73.975 40.68086) 4
4-nights, 3-all other times http://web.mta.info/nyct/service/ Pennsylvania Ave 3-4 5 POINT (-73.89489 40.66471) 5
1-all times, exit only northbound http://web.mta.info/nyct/service/ 238th St 1 6 POINT (-73.90087 40.88467) 6

Re-project subway so that it has the same crs (2263) as the homes

subway_copy <- copy(subway)
subway_sf <- st_transform(subway_copy, crs = 2263)

There isn’t a borough column, so I can’t filter for subway stations only located in Manhattan, but this isn’t an issue.

Plot all homes and subway stations

plot(manhattan_sf_buff$geometry, col="Blue")
plot(st_geometry(subway_sf), col = "orange", pch = 3, add= TRUE)

Calculate intersection

subway_intersection <- st_intersection(x = manhattan_sf_buff, y = subway_sf)
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries

Plot of all homes and only parks that intersect

plot(manhattan_sf_buff$geometry, col="Blue")
plot(subway_intersection, col = "orange", pch = 3, add = TRUE)
## Warning in plot.sf(subway_intersection, col = "orange", pch = 3, add = TRUE):
## ignoring all but the first attribute

Calculate the number of subway stations within the buffer of each home

subway_int_result <- subway_intersection %>% 
  group_by(home_ID) %>% 
  count()

formattable(head(subway_int_result))
home_ID n geometry
2 1 POINT (995363.9 221130)
3 6 MULTIPOINT ((983444.6 20648…
5 5 MULTIPOINT ((984873.3 20805…
6 3 MULTIPOINT ((983444.6 20648…
7 1 POINT (995363.9 221130)
8 2 MULTIPOINT ((994945.9 22930…

I add the column of counts of subway stations that intersect with home buffers to the main dataframe: manhattan_ints.

Left join manhattan_ints with subway_int_result

manhattan_ints <- merge(x = manhattan_ints, y = subway_int_result, by = "home_ID", all.x = TRUE)
## Warning in merge.data.frame(x = manhattan_ints, y = subway_int_result, by =
## "home_ID", : column names 'geometry.x', 'geometry.y' are duplicated in the
## result
manhattan_ints$n[is.na(manhattan_ints$n)] <- 0
colnames(manhattan_ints)[26] <- "num_subway"
formattable(head(manhattan_ints))
home_ID ADDRESS NEIGHBORHOOD BUILDING.CLASS.CATEGORY BLOCK LOT ZIP.CODE RESIDENTIAL.UNITS COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE BUILDING.CLASS.AT.TIME.OF.SALE SALE.PRICE SALE.DATE lat lon num_schools geometry.x num_food geometry.y num_parks num_subway
1 1 SUTTON PLACE, Manhattan MIDTOWN EAST 01 ONE FAMILY DWELLINGS 1372 26 10022 1 0 1 1,434 5,339 1920 1 A7 13000000 10/8/19 40.75738 -73.96031 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY 18 0
2 106 EAST 78TH STREET, Manhattan UPPER EAST SIDE (59-79) 01 ONE FAMILY DWELLINGS 1412 68 10075 1 1 2 1,839 4,344 1899 1 S1 7200000 6/10/19 40.77457 -73.96056 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY 14 1
3 109 WEST 11TH STREET, Manhattan GREENWICH VILLAGE-WEST 01 ONE FAMILY DWELLINGS 607 50 10011 1 0 1 1,665 4,060 1899 1 A4 11200000 5/22/19 40.73577 -73.99867 2 POINT (985433.1 207572.8) 0 GEOMETRYCOLLECTION EMPTY 20 6
4 11 EAST 129 STREET, Manhattan HARLEM-CENTRAL 01 ONE FAMILY DWELLINGS 1754 8 10035 1 0 1 1,081 2,800 1905 1 A4 1550000 8/22/19 40.75523 -73.96185 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY 13 0
5 11 WEST 12TH STREET, Manhattan GREENWICH VILLAGE-CENTRAL 01 ONE FAMILY DWELLINGS 576 48 10011 1 0 1 2,581 8,125 1910 1 A4 19900000 6/14/19 40.73509 -73.99506 3 MULTIPOINT ((985433.1 20757… 1 POINT (986930.1 207834.2) 5 5
6 110 WASHINGTON PLACE, Manhattan GREENWICH VILLAGE-WEST 03 THREE FAMILY DWELLINGS 592 14 10014 3 0 3 1,575 2,646 1899 1 C0 6800000 4/15/19 40.73242 -74.00115 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY 41 3

Final Dataframe

Before I go onto feature selection, I want to look at my “final” dataframe: manhattan_ints.

formattable(head(manhattan_ints))
home_ID ADDRESS NEIGHBORHOOD BUILDING.CLASS.CATEGORY BLOCK LOT ZIP.CODE RESIDENTIAL.UNITS COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE BUILDING.CLASS.AT.TIME.OF.SALE SALE.PRICE SALE.DATE lat lon num_schools geometry.x num_food geometry.y num_parks num_subway
1 1 SUTTON PLACE, Manhattan MIDTOWN EAST 01 ONE FAMILY DWELLINGS 1372 26 10022 1 0 1 1,434 5,339 1920 1 A7 13000000 10/8/19 40.75738 -73.96031 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY 18 0
2 106 EAST 78TH STREET, Manhattan UPPER EAST SIDE (59-79) 01 ONE FAMILY DWELLINGS 1412 68 10075 1 1 2 1,839 4,344 1899 1 S1 7200000 6/10/19 40.77457 -73.96056 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY 14 1
3 109 WEST 11TH STREET, Manhattan GREENWICH VILLAGE-WEST 01 ONE FAMILY DWELLINGS 607 50 10011 1 0 1 1,665 4,060 1899 1 A4 11200000 5/22/19 40.73577 -73.99867 2 POINT (985433.1 207572.8) 0 GEOMETRYCOLLECTION EMPTY 20 6
4 11 EAST 129 STREET, Manhattan HARLEM-CENTRAL 01 ONE FAMILY DWELLINGS 1754 8 10035 1 0 1 1,081 2,800 1905 1 A4 1550000 8/22/19 40.75523 -73.96185 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY 13 0
5 11 WEST 12TH STREET, Manhattan GREENWICH VILLAGE-CENTRAL 01 ONE FAMILY DWELLINGS 576 48 10011 1 0 1 2,581 8,125 1910 1 A4 19900000 6/14/19 40.73509 -73.99506 3 MULTIPOINT ((985433.1 20757… 1 POINT (986930.1 207834.2) 5 5
6 110 WASHINGTON PLACE, Manhattan GREENWICH VILLAGE-WEST 03 THREE FAMILY DWELLINGS 592 14 10014 3 0 3 1,575 2,646 1899 1 C0 6800000 4/15/19 40.73242 -74.00115 0 GEOMETRYCOLLECTION EMPTY 0 GEOMETRYCOLLECTION EMPTY 41 3

I don’t need the geometry columns so I drop them and am left with the columns I care about. Specifically, all my variables come from this dataframe:

Dependent Variable: SALE.PRICE Independent Variables: num_schools, num_food, num_parks, num_subway, RESIDENTIAL.UNITS, COMMERCIAL.UNITS, LAND.SQUARE.FEET, GROSS.SQUARE.FEET, NEIGHBORHOOD

manhattan_ints <- manhattan_ints[ , !(names(manhattan_ints) %like% "geometry")]

I have to transform the NEIGHBORHOOD column into an ordinal, categorical variable and will do so by ranking the neighborhoods by income. This is because the high median income in a neighborhood typically indicates high property prices and thus rich neighborhoods have a premium on home prices.

unique(manhattan_ints$NEIGHBORHOOD)
##  [1] "MIDTOWN EAST"              "UPPER EAST SIDE (59-79)"  
##  [3] "GREENWICH VILLAGE-WEST"    "HARLEM-CENTRAL"           
##  [5] "GREENWICH VILLAGE-CENTRAL" "MANHATTAN VALLEY"         
##  [7] "UPPER EAST SIDE (79-96)"   "SOHO"                     
##  [9] "UPPER WEST SIDE (79-96)"   "INWOOD"                   
## [11] "EAST VILLAGE"              "HARLEM-EAST"              
## [13] "GRAMERCY"                  "UPPER WEST SIDE (59-79)"  
## [15] "WASHINGTON HEIGHTS LOWER"  "MURRAY HILL"              
## [17] "CHELSEA"                   "TRIBECA"                  
## [19] "SOUTHBRIDGE"               "ALPHABET CITY"            
## [21] "WASHINGTON HEIGHTS UPPER"  "KIPS BAY"                 
## [23] "HARLEM-UPPER"

According to https://ny.curbed.com/2017/8/4/16099252/new-york-neighborhood-affordability, the neighborhoods with the highest to lowest median household income are: 1) Upper East Side (59-79); Upper East Side (79-96) 2) Soho; Tribeca 3) Midtown East 4) Greenwich Village Central; Greenwich Village West 5) Chelsea 6) Gramercy 7) Murray Hill; Kips Bay 8) Upper West Side (59-79); Upper West Side (79-96); Manhattan Valley 9) East Village; Alphabet City 10) Washington Heights Upper 11) Harlem Central; Harlem Upper 12) Inwood 13) Washington Heights Lower 14) Southbridge 15) Harlem East

As such, I’ll assign each level a number, with 15 being the highest income and 1 being the lowest income (the reverse of the above list).

manhattan_ints$neighborhood_ord <- 
  ifelse(manhattan_ints$NEIGHBORHOOD %like% "UPPER EAST SIDE", 15,
         ifelse(manhattan_ints$NEIGHBORHOOD %in% c("SOHO", "TRIBECA"), 14,
                ifelse(manhattan_ints$NEIGHBORHOOD %like% "MIDTOWN EAST", 13,
                       ifelse(manhattan_ints$NEIGHBORHOOD %like% "GREENWICH VILLAGE-", 12,
                              ifelse(manhattan_ints$NEIGHBORHOOD %like% "CHELSEA", 11,
                                     ifelse(manhattan_ints$NEIGHBORHOOD %like% "GRAMERCY", 10,
                                            ifelse(manhattan_ints$NEIGHBORHOOD %in% c("MURRAY HILL", "KIPS BAY"), 9,
                                                                                      ifelse(manhattan_ints$NEIGHBORHOOD %in% c("UPPER WEST SIDE (59-79)", "UPPER WEST SIDE (79-96)", "MANHATTAN VALLEY"), 8,
                                                                                             ifelse(manhattan_ints$NEIGHBORHOOD %in% c("EAST VILLAGE", "ALPHABET CITY"), 7,
                                                                                                     ifelse(manhattan_ints$NEIGHBORHOOD %like% "WASHINGTON HEIGHTS UPPER", 6,
                                                                                                            ifelse(manhattan_ints$NEIGHBORHOOD %in% c("HARLEM-CENTRAL", "HARLEM-UPPER"), 5,
                                                                                                                    ifelse(manhattan_ints$NEIGHBORHOOD %like% "INWOOD", 4,
                                                                                                                           ifelse(manhattan_ints$NEIGHBORHOOD %like% "WASHINGTON HEIGHTS LOWER", 3,
                                                                                                                                  ifelse(manhattan_ints$NEIGHBORHOOD %like% "SOUTHBRIDGE", 2,
                                                                                                                                  ifelse(manhattan_ints$NEIGHBORHOOD %like% "HARLEM-EAST", 1, 0)))))))))))))))
formattable(head(manhattan_ints))
home_ID ADDRESS NEIGHBORHOOD BUILDING.CLASS.CATEGORY BLOCK LOT ZIP.CODE RESIDENTIAL.UNITS COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE BUILDING.CLASS.AT.TIME.OF.SALE SALE.PRICE SALE.DATE lat lon num_schools num_food num_parks num_subway neighborhood_ord
1 1 SUTTON PLACE, Manhattan MIDTOWN EAST 01 ONE FAMILY DWELLINGS 1372 26 10022 1 0 1 1,434 5,339 1920 1 A7 13000000 10/8/19 40.75738 -73.96031 0 0 18 0 13
2 106 EAST 78TH STREET, Manhattan UPPER EAST SIDE (59-79) 01 ONE FAMILY DWELLINGS 1412 68 10075 1 1 2 1,839 4,344 1899 1 S1 7200000 6/10/19 40.77457 -73.96056 0 0 14 1 15
3 109 WEST 11TH STREET, Manhattan GREENWICH VILLAGE-WEST 01 ONE FAMILY DWELLINGS 607 50 10011 1 0 1 1,665 4,060 1899 1 A4 11200000 5/22/19 40.73577 -73.99867 2 0 20 6 12
4 11 EAST 129 STREET, Manhattan HARLEM-CENTRAL 01 ONE FAMILY DWELLINGS 1754 8 10035 1 0 1 1,081 2,800 1905 1 A4 1550000 8/22/19 40.75523 -73.96185 0 0 13 0 5
5 11 WEST 12TH STREET, Manhattan GREENWICH VILLAGE-CENTRAL 01 ONE FAMILY DWELLINGS 576 48 10011 1 0 1 2,581 8,125 1910 1 A4 19900000 6/14/19 40.73509 -73.99506 3 1 5 5 12
6 110 WASHINGTON PLACE, Manhattan GREENWICH VILLAGE-WEST 03 THREE FAMILY DWELLINGS 592 14 10014 3 0 3 1,575 2,646 1899 1 C0 6800000 4/15/19 40.73242 -74.00115 0 0 41 3 12

So now,

Dependent Variable: SALE.PRICE

Independent Variables: num_schools, num_food, num_parks, num_subway, RESIDENTIAL.UNITS, COMMERCIAL.UNITS, LAND.SQUARE.FEET, GROSS.SQUARE.FEET, neighborhood_ord

Next, I have to perform feature selection and will show my steps in the next document.